import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import scipy.stats as sci
import plotly.express as px
pd.options.mode.chained_assignment = None
The motivation for this project is that breast cancer is the second leading cause of death of women in America according to cancer.gov. Breast cancer can be either benign or malignant. For a benign cancer, a woman may be able to live a healthy life without needing to worry about the cancer, however a malignant cancer may be something that must be taken with urgency.
For that reason, we chose a breast cancer dataset provided by Kaggle.
# Load the breast cancer dataset (Kaggle CSV, Wisconsin diagnostic data)
# and preview the first rows to sanity-check the parse.
breast_cancer_df = pd.read_csv('data.csv')
breast_cancer_df.head(5)
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
For cleaning, the column labeled "Unnamed: 32" is removed because it is filled with missing values. The "id" column is also deleted as the ids are not needed for the analysis.
# Clean the raw data:
#  - "id" carries no predictive information for this analysis
#  - "Unnamed: 32" is an export artifact that is entirely NaN
# Dropping both columns in a single call avoids building an intermediate
# DataFrame for each drop.
breast_cancer_df = breast_cancer_df.drop(columns=["id", "Unnamed: 32"])
# Safety net: remove any other column that still contains missing values.
# After the explicit drops above this removes nothing for this dataset.
breast_cancer_df = breast_cancer_df.dropna(axis=1)
breast_cancer_df.head(5)
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 31 columns
A quick summary of the numeric variables in the dataset
# Condensed summary of the numeric columns, keeping only the five
# statistics of interest and giving them reader-friendly row labels.
stat_labels = {
    'mean': 'Mean',
    '50%': 'Median',
    'std': 'Standard Deviation',
    'min': 'Min',
    'max': 'Max',
}
summary = (
    breast_cancer_df.describe()
    .loc[list(stat_labels), :]
    .rename(index=stat_labels)
)
summary
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
| Median | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
| Standard Deviation | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
| Min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
| Max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
5 rows × 30 columns
The bar plot below shows that the dataset has more cases with benign (B) tumours than malignant (M) tumours.
# Bar chart of the class balance: counts of benign (B) vs malignant (M)
# diagnoses. hue='diagnosis' colors the bars per class and produces one
# bar container per class, hence the two bar_label calls below.
target_plot = sns.countplot(data=breast_cancer_df, x='diagnosis', hue='diagnosis')
target_plot.set(title="Barplot of Diagnosis" )
# Annotate each bar with its raw count.
target_plot.bar_label(target_plot.containers[0])
target_plot.bar_label(target_plot.containers[1])
plt.show()
# Scatter of mean radius vs mean area, colored by diagnosis, with a single
# overall regression line overlaid (ci=0 suppresses the confidence band,
# scatter=False draws only the line on top of the existing points).
graph = sns.relplot(data=breast_cancer_df, x="radius_mean",y="area_mean", hue="diagnosis").set(title="How Average size of Tumor affects Serverity of Cancer")
graph.map(sns.regplot, "radius_mean","area_mean",ci=0, scatter=False, color='black')
graph.set_xlabels("Average Radius")
graph.set_ylabels("Average Area")
<seaborn.axisgrid.FacetGrid at 0x7fea98d52e80>
The first plot shows the relationship between average radius and average area. It can clearly be deduced that the larger the radius and area of a tumor, the higher the chances of the tumor being malignant. This graph also pointed out that radius and area go hand in hand, so in the later graphs we will mostly refer only to area, as it takes radius into account anyway.
# Scatter of mean compactness vs mean concavity, colored by diagnosis,
# with an overall regression line (no confidence band, line only).
graph1 = sns.relplot(data=breast_cancer_df, x="compactness_mean",y="concavity_mean", hue="diagnosis").set(title="How Average Concavity and Compactness of Tumor affects severity of Cancer")
graph1.map(sns.regplot, "compactness_mean","concavity_mean",ci=0, scatter=False, color='black')
graph1.set_xlabels("Average Compactness")
graph1.set_ylabels("Average Concavity")
<seaborn.axisgrid.FacetGrid at 0x7fea9947e048>
The second plot shows the relationship between average concavity and average compactness. We felt these two factors described the overall shape and hardness of the tumor, and according to the linear regression, the more compact and concave the tumor is, the higher the chances of it being malignant.
# Scatter of mean smoothness vs mean texture, colored by diagnosis,
# with an overall regression line (no confidence band, line only).
graph2 = sns.relplot(data=breast_cancer_df, x="smoothness_mean",y="texture_mean", hue="diagnosis").set(title="How Average Texure and Smoothness of Tumor affects severity of Cancer")
graph2.map(sns.regplot, "smoothness_mean","texture_mean",ci=0, scatter=False, color='black')
graph2.set_xlabels("Average Smoothness")
graph2.set_ylabels("Average Texture")
<seaborn.axisgrid.FacetGrid at 0x7fea99ceeda0>
This third plot shows the relationship between average texture and average smoothness of a tumor. Basing our analysis off of the scatter plot and linear regression line there is an obvious relationship between these two factors.
# Side-by-side boxplots comparing benign vs malignant tumours on three
# mean features: area, compactness, and concavity.
M_data = breast_cancer_df.loc[breast_cancer_df["diagnosis"]=="M"]
B_data = breast_cancer_df.loc[breast_cancer_df["diagnosis"]=="B"]
colors = ["green","purple"]  # benign, malignant
fig, axs = plt.subplots(1,3)
fig.suptitle("Analysis of Average Area, Compactness, and Concavity")
# One subplot per feature; the repeated boxplot/recolor code is folded
# into a single loop.
features = [("area_mean", "Average Area"),
            ("compactness_mean", "Average Compactness"),
            ("concavity_mean", "Average Concavity")]
first_boxes = None
for ax, (column, xlabel) in zip(axs, features):
    bp = ax.boxplot([B_data[column], M_data[column]],
                    patch_artist=True, labels=["Benign","Malignant"])
    ax.set_xlabel(xlabel)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
    if first_boxes is None:
        first_boxes = bp["boxes"]  # reuse the first subplot's patches for the legend
# Fix: legend label typo ("Bengin" -> "Benign").
fig.legend([first_boxes[0], first_boxes[1]], ["Benign","Malignant"],
           bbox_to_anchor=(1.04,1), loc="upper left")
fig.tight_layout()
# Fix: plt.show was missing its call parentheses, so the cell displayed the
# function object instead of rendering the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
We created a side-by-side boxplot comparing the average area, average compactness, and average concavity of the tumor, separated by whether the tumor was diagnosed as benign or malignant. We were able to confirm that malignant tumors have a significantly higher average area, average compactness, and average concavity.
This final 3D graph visualizes the relationship between average area, average compactness, and average concavity. It shows that the higher the area, compactness, and concavity of a tumor, the higher the chances of that tumor being malignant.
# Interactive 3D scatter of the three most discriminative mean features,
# colored and symbol-coded by diagnosis.
fig = px.scatter_3d(breast_cancer_df, x='area_mean', y='compactness_mean',z='concavity_mean', color='diagnosis', symbol='diagnosis', size_max=18, opacity=0.8)
# Fix: the x axis plots area_mean, so both the title and the x-axis label
# should read "Average Area" (they previously said "Average Mean").
fig.update_layout(title = "Average Area vs Compactness vs Concavity",scene = dict(
                    xaxis_title='Average Area',
                    yaxis_title='Average Compactness',
                    zaxis_title='Average Concavity'),)
fig.show(renderer='notebook')
This final 3D graph confirms the relationship between average area, average compactness, and average concavity: the higher all three values are for a tumor, the higher the chances of that tumor being malignant.
To make it easier for the model to learn, we are going to change B to 0 and M to 1 so that the model is able to differentiate between benign and malignant cancer.
# Keep only the features used by the model plus the target column.
# .copy() makes this an independent frame, so the assignment below cannot
# alias breast_cancer_df (the original relied on suppressing the
# chained-assignment warning at the top of the file instead).
breast_cancer_df_model = breast_cancer_df[["diagnosis","radius_mean", "texture_mean", "area_mean", "compactness_mean", "concavity_mean", "smoothness_mean"]].copy()
# Encode the target numerically: benign (B) -> 0, anything else (M) -> 1.
# np.where mirrors the original if/else per-row logic but replaces the slow
# iterrows loop with a single vectorized pass.
breast_cancer_df_model["diagnosis"] = np.where(breast_cancer_df_model["diagnosis"] == "B", 0, 1)
We are going to split the data into two parts: the independent variables, which include the features we feel are best for indicating the type of cancer, and the dependent variable, the type of cancer, which is separated so we can use it for training and for checking whether our model is accurate.
To build the model using K-Nearest Neighbors (KNN), the data is first sliced into the independent and dependent variables. Then the data is split into the test and train sets. For this project 75% of the data is used to train the model, while the remaining 25% is used to test the model. Feature scaling (normalization) is applied since KNN is a distance based model.
# Slice into features (X, all columns after the first) and target
# (y, the encoded diagnosis in column 0).
X = breast_cancer_df_model.iloc[:, 1:].values
y = breast_cancer_df_model.iloc[:, 0].values
# 75/25 train/test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
norm = MinMaxScaler()
# Fit the scaler on the training data only ...
X_norm_train = norm.fit_transform(X_train)
# ... and apply those same training-set parameters to the test data.
# Fix: the original called fit_transform on X_test as well, which re-fits
# the scaler on the test set — a form of data leakage that also puts train
# and test on inconsistent scales.
X_norm_test = norm.transform(X_test)
# NOTE(review): the KNN models below are fit on the unscaled X_train, so
# these normalized arrays are currently unused — confirm whether the fits
# were intended to use X_norm_train / X_norm_test.
We chose 75% as training data and 25% as testing data. Since we have around 425 training samples, and the square root of 425 is about 20.6, we initially chose 20 and 21 as our numbers of neighbors. After choosing our neighbor counts, we made our two models.
# Two candidate KNN classifiers, with k chosen around sqrt(n_train) ≈ 20.6.
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_21 = KNeighborsClassifier(n_neighbors=21)
# The 0/1 encoding left the labels as Python objects; cast to int so
# sklearn treats this as a classification target.
y_train = y_train.astype('int')
# NOTE(review): both models are fit on the unscaled X_train rather than
# X_norm_train, despite the normalization step above — confirm intent.
knn_20.fit(X_train, y_train)
knn_21.fit(X_train, y_train)
type(knn_20)
sklearn.neighbors._classification.KNeighborsClassifier
After creating the models, we tested the 2 models using the remaining 25% of data. This data does not show the type of cancer, our model will output a array of either 0 or 1 indicating the type of predicted cancer.
# Predict diagnoses (0 = benign, 1 = malignant) for the held-out 25%.
knn_20_predict = knn_20.predict(X_test)
knn_21_predict = knn_21.predict(X_test)
We found that both models have an accuracy of 88%. We made a confusion matrix and plot that indicates whether the cancer was correctly identified along with any mistakes such as false positive and false negative.
# Match the dtype cast applied to y_train before comparing predictions.
y_test = y_test.astype('int')
# Confusion matrix and accuracy for each candidate k.
cmatrix_20 = confusion_matrix(y_test, knn_20_predict)
acscore_20= accuracy_score(y_test, knn_20_predict)
cmatrix_21 = confusion_matrix(y_test, knn_21_predict)
acscore_21= accuracy_score(y_test, knn_21_predict)
print(f"The KNN model where k = 20 has an accuracy rate of {round(acscore_20*100, 2)}%")
print(f"The KNN model where k = 21 has an accuracy rate of {round(acscore_21*100, 2)}%")
# Side-by-side annotated heatmaps of the two confusion matrices.
fig, ax =plt.subplots(1,2, figsize = (10,5))
sns.heatmap(cmatrix_20, annot=True, cbar=False, cmap=plt.cm.Blues, ax=ax[0])
sns.heatmap(cmatrix_21, annot=True, cbar=False, cmap=plt.cm.Blues, ax=ax[1])
ax[0].set(title = "K-value = 20")
ax[1].set(title = "K-value = 21")
fig.suptitle("Confusion Matrix Plot")
fig.supxlabel('Predicted Label')
fig.supylabel('True Label')
plt.show()
The KNN model where k = 20 has an accuracy rate of 88.81% The KNN model where k = 21 has an accuracy rate of 88.81%
Looking at the accuracy rate and confusion matrix for the models, K-values of 20 and 21 produce the same result — 88.81% accuracy. Plotting the mean error at each K-value could be helpful in creating a better model. Since our models were very quick to train and test, we decided to see whether a number of neighbors from 1 to 30 had a better accuracy.
# Sweep the number of neighbors and record the misclassification rate on
# the test set for each k.
# Fix: the original used range(1, 30), which stops at k = 29 even though
# the text describes testing k from 1 to 30.
max_k = 30
error = []
for k in range(1, max_k + 1):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Fraction of test points whose predicted label differs from the truth.
    error.append(np.mean(y_pred != y_test))
plt.figure(figsize=(10, 5))
plt.plot(range(1, max_k + 1), error, linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K Value')
plt.xlabel('K Value')
plt.ylabel('Error Rate')
plt.show()
From the graph above, we can see that there is a lower error rate when K-value is equal to 6.
# Retrain with the best k found in the sweep (k = 6) and report its
# accuracy and confusion matrix on the same held-out test set.
knn_6 = KNeighborsClassifier(n_neighbors=6)
knn_6.fit(X_train, y_train)
knn_6_predict = knn_6.predict(X_test)
cmatrix_6= confusion_matrix(y_test, knn_6_predict)
acscore_6= accuracy_score(y_test, knn_6_predict)
print(f"The KNN model where k = 6 has an accuracy rate of {round(acscore_6*100, 2)}%")
c_6 = sns.heatmap(cmatrix_6, annot=True, cbar=False, cmap=plt.cm.Blues)
c_6.set_title("Confusion Matrix Plot")
c_6.set_xlabel('Predicted Label')
c_6.set_ylabel('True Label')
plt.show()
The KNN model where k = 6 has an accuracy rate of 89.51%